#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling #need to install using anaconda prompt (pip install pandas_profiling)
import sklearn
import statsmodels
import scipy.stats as stats
import os
import datetime as dt
import itertools
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
#importing dataset
data = pd.read_excel('Data Set.xlsx')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Columns: 130 entries, custid to response_03 dtypes: float64(31), int64(97), object(2) memory usage: 5.0+ MB
data.head()
| custid | region | townsize | gender | age | agecat | birthmonth | ed | edcat | jobcat | ... | owncd | ownpda | ownpc | ownipod | owngame | ownfax | news | response_01 | response_02 | response_03 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3964-QJWTRG-NPN | 1 | 2.0 | 1 | 20 | 2 | September | 15 | 3 | 1 | ... | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0648-AIPJSP-UVM | 5 | 5.0 | 0 | 22 | 2 | May | 17 | 4 | 2 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
| 2 | 5195-TLUDJE-HVO | 3 | 4.0 | 1 | 67 | 6 | June | 14 | 2 | 2 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 4459-VLPQUH-3OL | 4 | 3.0 | 0 | 23 | 2 | May | 16 | 3 | 2 | ... | 1 | 0 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 0 |
| 4 | 8158-SMTQFB-CNO | 2 | 2.0 | 0 | 26 | 3 | July | 16 | 3 | 2 | ... | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
5 rows × 130 columns
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| region | 5000.0 | 3.001400 | 1.421760 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 |
| townsize | 4998.0 | 2.687275 | 1.425925 | 1.000000 | 1.000000 | 3.000000 | 4.000000 | 5.000000 |
| gender | 5000.0 | 0.503600 | 0.500037 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| age | 5000.0 | 47.025600 | 17.770338 | 18.000000 | 31.000000 | 47.000000 | 62.000000 | 79.000000 |
| agecat | 5000.0 | 4.238800 | 1.308785 | 2.000000 | 3.000000 | 4.000000 | 5.000000 | 6.000000 |
| ed | 5000.0 | 14.543000 | 3.281083 | 6.000000 | 12.000000 | 14.000000 | 17.000000 | 23.000000 |
| edcat | 5000.0 | 2.672000 | 1.211738 | 1.000000 | 2.000000 | 2.000000 | 4.000000 | 5.000000 |
| jobcat | 5000.0 | 2.752800 | 1.737900 | 1.000000 | 1.000000 | 2.000000 | 4.000000 | 6.000000 |
| union | 5000.0 | 0.151200 | 0.358280 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| employ | 5000.0 | 9.730400 | 9.690929 | 0.000000 | 2.000000 | 7.000000 | 15.000000 | 52.000000 |
| empcat | 5000.0 | 2.932600 | 1.453300 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 |
| retire | 5000.0 | 0.147600 | 0.354739 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| income | 5000.0 | 54.759600 | 55.377511 | 9.000000 | 24.000000 | 38.000000 | 67.000000 | 1073.000000 |
| lninc | 5000.0 | 3.699909 | 0.747072 | 2.197225 | 3.178054 | 3.637586 | 4.204693 | 6.978214 |
| inccat | 5000.0 | 2.392200 | 1.221261 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 5.000000 |
| debtinc | 5000.0 | 9.954160 | 6.399783 | 0.000000 | 5.100000 | 8.800000 | 13.600000 | 43.100000 |
| creddebt | 5000.0 | 1.857326 | 3.415732 | 0.000000 | 0.385520 | 0.926437 | 2.063820 | 109.072596 |
| lncreddebt | 4999.0 | -0.130454 | 1.273058 | -6.597334 | -0.952685 | -0.076106 | 0.724665 | 4.692014 |
| othdebt | 5000.0 | 3.654460 | 5.395172 | 0.000000 | 0.980301 | 2.098540 | 4.314780 | 141.459150 |
| lnothdebt | 4999.0 | 0.696915 | 1.128578 | -4.092107 | -0.018987 | 0.741537 | 1.462053 | 4.952011 |
| default | 5000.0 | 0.234200 | 0.423540 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| jobsat | 5000.0 | 2.964200 | 1.379456 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 |
| marital | 5000.0 | 0.480200 | 0.499658 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| spoused | 5000.0 | 6.112800 | 7.743518 | -1.000000 | -1.000000 | -1.000000 | 14.000000 | 24.000000 |
| spousedcat | 5000.0 | 0.641400 | 1.886775 | -1.000000 | -1.000000 | -1.000000 | 2.000000 | 5.000000 |
| reside | 5000.0 | 2.204000 | 1.393977 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 9.000000 |
| pets | 5000.0 | 3.067400 | 3.414497 | 0.000000 | 0.000000 | 2.000000 | 5.000000 | 21.000000 |
| pets_cats | 5000.0 | 0.500400 | 0.860783 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 6.000000 |
| pets_dogs | 5000.0 | 0.392400 | 0.796084 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 7.000000 |
| pets_birds | 5000.0 | 0.110400 | 0.494227 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| cardten | 4998.0 | 720.478391 | 922.225527 | 0.000000 | 0.000000 | 425.000000 | 1080.000000 | 13705.000000 |
| lncardten | 3578.0 | 6.426309 | 1.172050 | 1.558145 | 5.857933 | 6.639876 | 7.218910 | 9.525516 |
| wireless | 5000.0 | 0.268800 | 0.443380 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| wiremon | 5000.0 | 10.701190 | 19.799837 | 0.000000 | 0.000000 | 0.000000 | 20.962500 | 186.250000 |
| lnwiremon | 1344.0 | 3.605001 | 0.390102 | 2.541602 | 3.330417 | 3.597997 | 3.865193 | 5.227090 |
| wireten | 5000.0 | 421.984610 | 1001.003287 | 0.000000 | 0.000000 | 0.000000 | 89.962500 | 12858.650000 |
| lnwireten | 1344.0 | 6.808132 | 1.283967 | 2.541602 | 6.158091 | 7.147185 | 7.755376 | 9.461772 |
| multline | 5000.0 | 0.488400 | 0.499915 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| voice | 5000.0 | 0.303000 | 0.459601 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| pager | 5000.0 | 0.243600 | 0.429297 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| internet | 5000.0 | 1.199600 | 1.449338 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 4.000000 |
| callid | 5000.0 | 0.475200 | 0.499435 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| callwait | 5000.0 | 0.479000 | 0.499609 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| forward | 5000.0 | 0.480600 | 0.499673 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| confer | 5000.0 | 0.478000 | 0.499566 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| ebill | 5000.0 | 0.348600 | 0.476575 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| owntv | 5000.0 | 0.983000 | 0.129284 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| hourstv | 5000.0 | 19.645000 | 5.165609 | 0.000000 | 17.000000 | 20.000000 | 23.000000 | 36.000000 |
| ownvcr | 5000.0 | 0.915600 | 0.278015 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| owndvd | 5000.0 | 0.913600 | 0.280982 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| owncd | 5000.0 | 0.932800 | 0.250393 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| ownpda | 5000.0 | 0.201000 | 0.400788 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| ownpc | 5000.0 | 0.632800 | 0.482090 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| ownipod | 5000.0 | 0.479200 | 0.499617 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| owngame | 5000.0 | 0.474800 | 0.499415 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| ownfax | 5000.0 | 0.178800 | 0.383223 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| news | 5000.0 | 0.472600 | 0.499299 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| response_01 | 5000.0 | 0.083600 | 0.276815 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| response_02 | 5000.0 | 0.129800 | 0.336117 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| response_03 | 5000.0 | 0.102600 | 0.303466 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
128 rows × 8 columns
data['Total_Spend']=data.cardspent+data.card2spent
data.drop(['cardspent','card2spent'],axis=1,inplace=True)
pandas_profiling.ProfileReport(data).to_file('Pandas Profiling')
pandas_profiling.ProfileReport(data)
Dropping Variables - address,age,cardtenure,card2tenure,card2tenurecat,commute,ed,equipmon,lncardmon,lncardten,lnequipmon, lnequipten,lninc,lnlongmon,lntollmon,lntollten,longten,spoused,spousedcat,lnlongten,lnwiremon
data.drop(['address','age','cardtenure','card2tenure','card2tenurecat','commute','ed','equipmon','lncardmon','lncardten','lnequipmon','lnequipten','lninc','lnlongmon','lntollmon','lntollten','longten','spoused','spousedcat','lnlongten','lnwiremon'],axis=1,inplace=True)
dtype_df = data.dtypes.reset_index().rename(columns={0:'datatype','index':'col_name'})
dtype_df
| col_name | datatype | |
|---|---|---|
| 0 | custid | object |
| 1 | region | int64 |
| 2 | townsize | float64 |
| 3 | gender | int64 |
| 4 | agecat | int64 |
| 5 | birthmonth | object |
| 6 | edcat | int64 |
| 7 | jobcat | int64 |
| 8 | union | int64 |
| 9 | employ | int64 |
| 10 | empcat | int64 |
| 11 | retire | int64 |
| 12 | income | int64 |
| 13 | inccat | int64 |
| 14 | debtinc | float64 |
| 15 | creddebt | float64 |
| 16 | lncreddebt | float64 |
| 17 | othdebt | float64 |
| 18 | lnothdebt | float64 |
| 19 | default | int64 |
| 20 | jobsat | int64 |
| 21 | marital | int64 |
| 22 | reside | int64 |
| 23 | pets | int64 |
| 24 | pets_cats | int64 |
| 25 | pets_dogs | int64 |
| 26 | pets_birds | int64 |
| 27 | pets_reptiles | int64 |
| 28 | pets_small | int64 |
| 29 | pets_saltfish | int64 |
| ... | ... | ... |
| 78 | cardmon | float64 |
| 79 | cardten | float64 |
| 80 | wireless | int64 |
| 81 | wiremon | float64 |
| 82 | wireten | float64 |
| 83 | lnwireten | float64 |
| 84 | multline | int64 |
| 85 | voice | int64 |
| 86 | pager | int64 |
| 87 | internet | int64 |
| 88 | callid | int64 |
| 89 | callwait | int64 |
| 90 | forward | int64 |
| 91 | confer | int64 |
| 92 | ebill | int64 |
| 93 | owntv | int64 |
| 94 | hourstv | int64 |
| 95 | ownvcr | int64 |
| 96 | owndvd | int64 |
| 97 | owncd | int64 |
| 98 | ownpda | int64 |
| 99 | ownpc | int64 |
| 100 | ownipod | int64 |
| 101 | owngame | int64 |
| 102 | ownfax | int64 |
| 103 | news | int64 |
| 104 | response_01 | int64 |
| 105 | response_02 | int64 |
| 106 | response_03 | int64 |
| 107 | Total_Spend | float64 |
108 rows × 2 columns
# Seperating num and cat data(as we have to perform different operations while preparing them)
num_data = data.loc[:,list(dtype_df.col_name[(dtype_df.datatype == 'float64')|(dtype_df.datatype == 'int64')|(dtype_df.datatype == 'float32')|(dtype_df.datatype == 'int32')])]
num_data.info()
print('---------------')
cat_data = data.loc[:,list(dtype_df.col_name[(dtype_df.datatype == 'object')|(dtype_df.datatype == 'O')])]
cat_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Columns: 106 entries, region to Total_Spend dtypes: float64(18), int64(88) memory usage: 4.0 MB --------------- <class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 2 columns): custid 5000 non-null object birthmonth 5000 non-null object dtypes: object(2) memory usage: 78.2+ KB
# Audit report of numerical data
def num_data_summary(x):
return pd.Series([x.count(),x.isnull().sum(),x.sum(),x.median(),x.mean(),x.std(),x.std()/x.mean(),x.var(),x.min(),x.quantile(0.01),x.quantile(0.10),x.quantile(0.25),x.quantile(0.50),x.quantile(0.75),x.quantile(0.90),x.quantile(0.95),x.quantile(0.99),x.max()],
index= ['Non_null','Null','Sum','Median','Mean','Std_Dev','CV','Variance','Min','P1','P10','P25','P50','P75','P90','P95','P99','Max'])
num_data_sum =num_data.apply(num_data_summary)
num_data_sum.to_csv('num_data_sum.csv')
num_data_sum.T
| Non_null | Null | Sum | Median | Mean | Std_Dev | CV | Variance | Min | P1 | P10 | P25 | P50 | P75 | P90 | P95 | P99 | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| region | 5000.0 | 0.0 | 1.500700e+04 | 3.000000 | 3.001400 | 1.421760 | 0.473699 | 2.021402e+00 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
| townsize | 4998.0 | 2.0 | 1.343100e+04 | 3.000000 | 2.687275 | 1.425925 | 0.530621 | 2.033262e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 | 4.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
| gender | 5000.0 | 0.0 | 2.518000e+03 | 1.000000 | 0.503600 | 0.500037 | 0.992925 | 2.500370e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| agecat | 5000.0 | 0.0 | 2.119400e+04 | 4.000000 | 4.238800 | 1.308785 | 0.308763 | 1.712917e+00 | 2.000000 | 2.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
| edcat | 5000.0 | 0.0 | 1.336000e+04 | 2.000000 | 2.672000 | 1.211738 | 0.453495 | 1.468310e+00 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 | 4.000000 | 4.000000 | 5.000000 | 5.000000 | 5.000000 |
| jobcat | 5000.0 | 0.0 | 1.376400e+04 | 2.000000 | 2.752800 | 1.737900 | 0.631321 | 3.020296e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 4.000000 | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
| union | 5000.0 | 0.0 | 7.560000e+02 | 0.000000 | 0.151200 | 0.358280 | 2.369574 | 1.283642e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| employ | 5000.0 | 0.0 | 4.865200e+04 | 7.000000 | 9.730400 | 9.690929 | 0.995944 | 9.391410e+01 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 7.000000 | 15.000000 | 25.000000 | 31.000000 | 39.000000 | 52.000000 |
| empcat | 5000.0 | 0.0 | 1.466300e+04 | 3.000000 | 2.932600 | 1.453300 | 0.495567 | 2.112080e+00 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
| retire | 5000.0 | 0.0 | 7.380000e+02 | 0.000000 | 0.147600 | 0.354739 | 2.403377 | 1.258394e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| income | 5000.0 | 0.0 | 2.737980e+05 | 38.000000 | 54.759600 | 55.377511 | 1.011284 | 3.066669e+03 | 9.000000 | 9.000000 | 16.000000 | 24.000000 | 38.000000 | 67.000000 | 109.100000 | 147.000000 | 272.010000 | 1073.000000 |
| inccat | 5000.0 | 0.0 | 1.196100e+04 | 2.000000 | 2.392200 | 1.221261 | 0.510518 | 1.491477e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 | 5.000000 | 5.000000 |
| debtinc | 5000.0 | 0.0 | 4.977080e+04 | 8.800000 | 9.954160 | 6.399783 | 0.642925 | 4.095723e+01 | 0.000000 | 0.700000 | 2.800000 | 5.100000 | 8.800000 | 13.600000 | 18.600000 | 22.200000 | 29.200000 | 43.100000 |
| creddebt | 5000.0 | 0.0 | 9.286628e+03 | 0.926437 | 1.857326 | 3.415732 | 1.839059 | 1.166722e+01 | 0.000000 | 0.033160 | 0.175682 | 0.385520 | 0.926437 | 2.063820 | 4.299470 | 6.373010 | 14.280358 | 109.072596 |
| lncreddebt | 4999.0 | 1.0 | -6.521372e+02 | -0.076106 | -0.130454 | 1.273058 | -9.758712 | 1.620678e+00 | -6.597334 | -3.401690 | -1.737842 | -0.952685 | -0.076106 | 0.724665 | 1.458625 | 1.852297 | 2.658910 | 4.692014 |
| othdebt | 5000.0 | 0.0 | 1.827230e+04 | 2.098540 | 3.654460 | 5.395172 | 1.476325 | 2.910788e+01 | 0.000000 | 0.114299 | 0.457997 | 0.980301 | 2.098540 | 4.314780 | 8.062046 | 11.815981 | 24.064260 | 141.459150 |
| lnothdebt | 4999.0 | 1.0 | 3.483879e+03 | 0.741537 | 0.696915 | 1.128578 | 1.619391 | 1.273689e+00 | -4.092107 | -2.168241 | -0.780312 | -0.018987 | 0.741537 | 1.462053 | 2.087178 | 2.469586 | 3.180802 | 4.952011 |
| default | 5000.0 | 0.0 | 1.171000e+03 | 0.000000 | 0.234200 | 0.423540 | 1.808455 | 1.793862e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| jobsat | 5000.0 | 0.0 | 1.482100e+04 | 3.000000 | 2.964200 | 1.379456 | 0.465372 | 1.902899e+00 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
| marital | 5000.0 | 0.0 | 2.401000e+03 | 0.000000 | 0.480200 | 0.499658 | 1.040520 | 2.496579e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| reside | 5000.0 | 0.0 | 1.102000e+04 | 2.000000 | 2.204000 | 1.393977 | 0.632476 | 1.943173e+00 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 4.000000 | 5.000000 | 6.000000 | 9.000000 |
| pets | 5000.0 | 0.0 | 1.533700e+04 | 2.000000 | 3.067400 | 3.414497 | 1.113157 | 1.165879e+01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 5.000000 | 8.000000 | 10.000000 | 13.000000 | 21.000000 |
| pets_cats | 5000.0 | 0.0 | 2.502000e+03 | 0.000000 | 0.500400 | 0.860783 | 1.720191 | 7.409480e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 6.000000 |
| pets_dogs | 5000.0 | 0.0 | 1.962000e+03 | 0.000000 | 0.392400 | 0.796084 | 2.028755 | 6.337490e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 2.000000 | 3.000000 | 7.000000 |
| pets_birds | 5000.0 | 0.0 | 5.520000e+02 | 0.000000 | 0.110400 | 0.494227 | 4.476697 | 2.442607e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 5.000000 |
| pets_reptiles | 5000.0 | 0.0 | 2.780000e+02 | 0.000000 | 0.055600 | 0.325776 | 5.859277 | 1.061299e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 6.000000 |
| pets_small | 5000.0 | 0.0 | 5.730000e+02 | 0.000000 | 0.114600 | 0.568798 | 4.963336 | 3.235315e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 | 7.000000 |
| pets_saltfish | 5000.0 | 0.0 | 2.330000e+02 | 0.000000 | 0.046600 | 0.469545 | 10.076074 | 2.204725e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 8.000000 |
| pets_freshfish | 5000.0 | 0.0 | 9.237000e+03 | 0.000000 | 1.847400 | 3.074801 | 1.664394 | 9.454404e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 7.000000 | 8.000000 | 11.000000 | 16.000000 |
| homeown | 5000.0 | 0.0 | 3.148000e+03 | 1.000000 | 0.629600 | 0.482960 | 0.767090 | 2.332505e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| cardmon | 5000.0 | 0.0 | 7.721925e+04 | 13.750000 | 15.443850 | 15.007569 | 0.971751 | 2.252271e+02 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 13.750000 | 22.750000 | 34.000000 | 42.000000 | 64.250000 | 188.500000 |
| cardten | 4998.0 | 2.0 | 3.600951e+06 | 425.000000 | 720.478391 | 922.225527 | 1.280018 | 8.504999e+05 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 425.000000 | 1080.000000 | 1871.500000 | 2455.750000 | 4011.200000 | 13705.000000 |
| wireless | 5000.0 | 0.0 | 1.344000e+03 | 0.000000 | 0.268800 | 0.443380 | 1.649479 | 1.965859e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| wiremon | 5000.0 | 0.0 | 5.350595e+04 | 0.000000 | 10.701190 | 19.799837 | 1.850246 | 3.920335e+02 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 20.962500 | 40.860000 | 51.305000 | 78.304000 | 186.250000 |
| wireten | 5000.0 | 0.0 | 2.109923e+06 | 0.000000 | 421.984610 | 1001.003287 | 2.372132 | 1.002008e+06 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 89.962500 | 1778.535000 | 2687.922500 | 4530.186000 | 12858.650000 |
| lnwireten | 1344.0 | 3656.0 | 9.150129e+03 | 7.147185 | 6.808132 | 1.283967 | 0.188593 | 1.648571e+00 | 2.541602 | 3.039749 | 4.881065 | 6.158091 | 7.147185 | 7.755376 | 8.106616 | 8.310817 | 8.690117 | 9.461772 |
| multline | 5000.0 | 0.0 | 2.442000e+03 | 0.000000 | 0.488400 | 0.499915 | 1.023578 | 2.499154e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| voice | 5000.0 | 0.0 | 1.515000e+03 | 0.000000 | 0.303000 | 0.459601 | 1.516836 | 2.112332e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| pager | 5000.0 | 0.0 | 1.218000e+03 | 0.000000 | 0.243600 | 0.429297 | 1.762303 | 1.842959e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| internet | 5000.0 | 0.0 | 5.998000e+03 | 1.000000 | 1.199600 | 1.449338 | 1.208184 | 2.100580e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 |
| callid | 5000.0 | 0.0 | 2.376000e+03 | 0.000000 | 0.475200 | 0.499435 | 1.050999 | 2.494348e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| callwait | 5000.0 | 0.0 | 2.395000e+03 | 0.000000 | 0.479000 | 0.499609 | 1.043025 | 2.496089e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| forward | 5000.0 | 0.0 | 2.403000e+03 | 0.000000 | 0.480600 | 0.499673 | 1.039687 | 2.496736e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| confer | 5000.0 | 0.0 | 2.390000e+03 | 0.000000 | 0.478000 | 0.499566 | 1.045117 | 2.495659e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| ebill | 5000.0 | 0.0 | 1.743000e+03 | 0.000000 | 0.348600 | 0.476575 | 1.367111 | 2.271235e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| owntv | 5000.0 | 0.0 | 4.915000e+03 | 1.000000 | 0.983000 | 0.129284 | 0.131520 | 1.671434e-02 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| hourstv | 5000.0 | 0.0 | 9.822500e+04 | 20.000000 | 19.645000 | 5.165609 | 0.262948 | 2.668351e+01 | 0.000000 | 0.000000 | 14.000000 | 17.000000 | 20.000000 | 23.000000 | 26.000000 | 28.000000 | 31.000000 | 36.000000 |
| ownvcr | 5000.0 | 0.0 | 4.578000e+03 | 1.000000 | 0.915600 | 0.278015 | 0.303642 | 7.729210e-02 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| owndvd | 5000.0 | 0.0 | 4.568000e+03 | 1.000000 | 0.913600 | 0.280982 | 0.307555 | 7.895083e-02 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| owncd | 5000.0 | 0.0 | 4.664000e+03 | 1.000000 | 0.932800 | 0.250393 | 0.268432 | 6.269670e-02 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| ownpda | 5000.0 | 0.0 | 1.005000e+03 | 0.000000 | 0.201000 | 0.400788 | 1.993971 | 1.606311e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| ownpc | 5000.0 | 0.0 | 3.164000e+03 | 1.000000 | 0.632800 | 0.482090 | 0.761836 | 2.324106e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| ownipod | 5000.0 | 0.0 | 2.396000e+03 | 0.000000 | 0.479200 | 0.499617 | 1.042607 | 2.496173e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| owngame | 5000.0 | 0.0 | 2.374000e+03 | 0.000000 | 0.474800 | 0.499415 | 1.051842 | 2.494148e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| ownfax | 5000.0 | 0.0 | 8.940000e+02 | 0.000000 | 0.178800 | 0.383223 | 2.143306 | 1.468599e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| news | 5000.0 | 0.0 | 2.363000e+03 | 0.000000 | 0.472600 | 0.499299 | 1.056493 | 2.492991e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| response_01 | 5000.0 | 0.0 | 4.180000e+02 | 0.000000 | 0.083600 | 0.276815 | 3.311180 | 7.662637e-02 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| response_02 | 5000.0 | 0.0 | 6.490000e+02 | 0.000000 | 0.129800 | 0.336117 | 2.589498 | 1.129746e-01 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| response_03 | 5000.0 | 0.0 | 5.130000e+02 | 0.000000 | 0.102600 | 0.303466 | 2.957759 | 9.209166e-02 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| Total_Spend | 5000.0 | 0.0 | 2.490393e+06 | 414.250000 | 498.078630 | 351.529270 | 0.705771 | 1.235728e+05 | 8.110000 | 58.197600 | 184.033000 | 276.282500 | 414.250000 | 615.562500 | 908.125000 | 1145.146500 | 1760.102400 | 4881.050000 |
106 rows × 18 columns
print(num_data.columns.tolist())
['region', 'townsize', 'gender', 'agecat', 'edcat', 'jobcat', 'union', 'employ', 'empcat', 'retire', 'income', 'inccat', 'debtinc', 'creddebt', 'lncreddebt', 'othdebt', 'lnothdebt', 'default', 'jobsat', 'marital', 'reside', 'pets', 'pets_cats', 'pets_dogs', 'pets_birds', 'pets_reptiles', 'pets_small', 'pets_saltfish', 'pets_freshfish', 'homeown', 'hometype', 'addresscat', 'cars', 'carown', 'cartype', 'carvalue', 'carcatvalue', 'carbought', 'carbuy', 'commutecat', 'commutetime', 'commutecar', 'commutemotorcycle', 'commutecarpool', 'commutebus', 'commuterail', 'commutepublic', 'commutebike', 'commutewalk', 'commutenonmotor', 'telecommute', 'reason', 'polview', 'polparty', 'polcontrib', 'vote', 'card', 'cardtype', 'cardbenefit', 'cardfee', 'cardtenurecat', 'card2', 'card2type', 'card2benefit', 'card2fee', 'active', 'bfast', 'tenure', 'churn', 'longmon', 'tollfree', 'tollmon', 'tollten', 'equip', 'equipten', 'callcard', 'cardmon', 'cardten', 'wireless', 'wiremon', 'wireten', 'lnwireten', 'multline', 'voice', 'pager', 'internet', 'callid', 'callwait', 'forward', 'confer', 'ebill', 'owntv', 'hourstv', 'ownvcr', 'owndvd', 'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax', 'news', 'response_01', 'response_02', 'response_03', 'Total_Spend']
# out of numerical varibles region needs to dummy encoded
dummy = pd.get_dummies(num_data['region'],prefix='region',drop_first=True) # drop_first=True to avoid dummy variable trap
num_data = num_data.drop(['region'],axis=1)
num_data = pd.concat([num_data,dummy],axis=1)
# Missing value and outliers of num data
def missing_replacing_num(x):
return x.fillna(x.median())
def outiers_capping(x):
return x.clip(x.quantile(0.01),x.quantile(0.99))
num_data_modified = num_data.apply(missing_replacing_num)
num_data_modified = num_data_modified.apply(outiers_capping)
num_data_modified.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Columns: 109 entries, townsize to region_5 dtypes: float64(19), int64(86), uint8(4) memory usage: 4.0 MB
# Audit report of categorical data
def cat_data_summary(x):
return pd.Series([x.count(),x.isnull().sum(),len(x.unique())],index=['Not_Null','Null','Unique'])
cat_data_sum = cat_data.apply(cat_data_summary)
cat_data_sum.to_csv('cat_data_sum.csv')
cat_data_sum
| custid | birthmonth | |
|---|---|---|
| Not_Null | 5000 | 5000 |
| Null | 0 | 0 |
| Unique | 5000 | 12 |
# custid is of no use
cat_data = cat_data.drop(['custid'],axis =1)
# dummy encoding birthmonth
dummy = pd.get_dummies(cat_data['birthmonth'],prefix='birthmonth',drop_first=True) # drop_first=True to avoid dummy variable trap
cat_data = cat_data.drop(['birthmonth'],axis=1)
cat_data_modified = pd.concat([cat_data,dummy],axis=1)
# Removal of columns in num data with low CV(<0.1)
drop_list =[]
for col_name in num_data_modified.columns:
if num_data_modified[col_name].std()/num_data_modified[col_name].mean()<0.1:
drop_list.append(col_name)
print(drop_list)
num_data_modified = num_data_modified.drop(drop_list,axis=1)
num_data_modified.head()
['lncreddebt', 'lnwireten']
| townsize | gender | agecat | edcat | jobcat | union | employ | empcat | retire | income | ... | ownfax | news | response_01 | response_02 | response_03 | Total_Spend | region_2 | region_3 | region_4 | region_5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 1 | 2 | 3 | 1 | 1 | 0 | 1 | 0 | 31.0 | ... | 0 | 0 | 0 | 1 | 0 | 149.46 | 0 | 0 | 0 | 0 |
| 1 | 5.0 | 0 | 2 | 4 | 2 | 0 | 0 | 1 | 0 | 15.0 | ... | 1 | 1 | 0 | 0 | 0 | 77.54 | 0 | 0 | 0 | 1 |
| 2 | 4.0 | 1 | 6 | 2 | 2 | 0 | 16 | 5 | 0 | 35.0 | ... | 0 | 1 | 0 | 0 | 0 | 359.97 | 0 | 1 | 0 | 0 |
| 3 | 3.0 | 0 | 2 | 3 | 2 | 0 | 0 | 1 | 0 | 20.0 | ... | 0 | 1 | 1 | 0 | 0 | 359.41 | 0 | 0 | 1 | 0 |
| 4 | 2.0 | 0 | 3 | 3 | 2 | 0 | 1 | 1 | 0 | 23.0 | ... | 0 | 0 | 0 | 1 | 0 | 507.83 | 1 | 0 | 0 | 0 |
5 rows × 107 columns
# Combine num and cat data
modified_data = pd.concat([num_data_modified,cat_data_modified],axis=1)
modified_data.head()
| townsize | gender | agecat | edcat | jobcat | union | employ | empcat | retire | income | ... | birthmonth_December | birthmonth_February | birthmonth_January | birthmonth_July | birthmonth_June | birthmonth_March | birthmonth_May | birthmonth_November | birthmonth_October | birthmonth_September | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 1 | 2 | 3 | 1 | 1 | 0 | 1 | 0 | 31.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 5.0 | 0 | 2 | 4 | 2 | 0 | 0 | 1 | 0 | 15.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 4.0 | 1 | 6 | 2 | 2 | 0 | 16 | 5 | 0 | 35.0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 3.0 | 0 | 2 | 3 | 2 | 0 | 0 | 1 | 0 | 20.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 2.0 | 0 | 3 | 3 | 2 | 0 | 1 | 1 | 0 | 23.0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 118 columns
# Assumption checking of linear reg
# target variable should be normally distributed
sns.distplot(modified_data.Total_Spend)
plt.show()
modified_data['log_Total_Spend']=np.log(modified_data['Total_Spend'])
sns.distplot(modified_data.log_Total_Spend)
modified_data = modified_data.drop(['Total_Spend'],axis=1)
# VARIBLE REDUCTION AND FEATURE SELECTION
modified_data.corr()
| townsize | gender | agecat | edcat | jobcat | union | employ | empcat | retire | income | ... | birthmonth_February | birthmonth_January | birthmonth_July | birthmonth_June | birthmonth_March | birthmonth_May | birthmonth_November | birthmonth_October | birthmonth_September | log_Total_Spend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| townsize | 1.000000 | 0.009576 | -0.069017 | 0.003744 | -0.010768 | -0.019846 | -0.059221 | -0.058059 | -0.065779 | -0.009478 | ... | -0.024500 | -0.008957 | 0.001582 | 0.005711 | 0.002053 | -0.000498 | -0.019298 | 0.020539 | 0.011270 | 0.006132 |
| gender | 0.009576 | 1.000000 | -0.007427 | -0.002673 | -0.016470 | 0.000311 | -0.011756 | -0.013842 | -0.021040 | -0.001949 | ... | 0.015168 | 0.000704 | -0.013059 | 0.007914 | 0.022453 | -0.009947 | 0.005998 | -0.003610 | -0.025860 | -0.078605 |
| agecat | -0.069017 | -0.007427 | 1.000000 | -0.064124 | 0.032554 | 0.004465 | 0.679902 | 0.732246 | 0.511335 | 0.234081 | ... | 0.019426 | -0.003469 | 0.020750 | 0.006999 | 0.013645 | -0.021179 | 0.007737 | -0.028357 | -0.011852 | 0.030192 |
| edcat | 0.003744 | -0.002673 | -0.064124 | 1.000000 | -0.069002 | -0.003240 | -0.199709 | -0.185672 | -0.089322 | 0.187173 | ... | -0.020811 | 0.009974 | -0.005718 | 0.003428 | -0.014674 | -0.017904 | 0.025506 | 0.023151 | -0.005022 | 0.094717 |
| jobcat | -0.010768 | -0.016470 | 0.032554 | -0.069002 | 1.000000 | 0.084778 | 0.416234 | 0.346723 | 0.173736 | 0.106852 | ... | -0.000695 | -0.006297 | 0.013418 | -0.001318 | -0.003819 | 0.017472 | -0.030736 | -0.005306 | 0.020834 | -0.010592 |
| union | -0.019846 | 0.000311 | 0.004465 | -0.003240 | 0.084778 | 1.000000 | 0.035272 | 0.028028 | 0.016392 | 0.007588 | ... | -0.010492 | 0.000998 | 0.023433 | 0.009049 | -0.015967 | -0.010117 | 0.005503 | -0.020332 | -0.023708 | 0.021191 |
| employ | -0.059221 | -0.011756 | 0.679902 | -0.199709 | 0.416234 | 0.035272 | 1.000000 | 0.903591 | 0.501182 | 0.350449 | ... | 0.024680 | -0.008892 | 0.015298 | 0.008642 | 0.015013 | -0.013904 | 0.002812 | -0.034176 | -0.004174 | 0.069522 |
| empcat | -0.058059 | -0.013842 | 0.732246 | -0.185672 | 0.346723 | 0.028028 | 0.903591 | 1.000000 | 0.408096 | 0.375761 | ... | 0.023954 | 0.005611 | 0.005418 | 0.014542 | 0.028922 | -0.012301 | -0.006657 | -0.035800 | -0.007219 | 0.096982 |
| retire | -0.065779 | -0.021040 | 0.511335 | -0.089322 | 0.173736 | 0.016392 | 0.501182 | 0.408096 | 1.000000 | -0.260484 | ... | -0.005494 | 0.002049 | 0.026712 | 0.002049 | -0.006945 | -0.005054 | -0.016422 | -0.001060 | 0.010554 | -0.198252 |
| income | -0.009478 | -0.001949 | 0.234081 | 0.187173 | 0.106852 | 0.007588 | 0.350449 | 0.375761 | -0.260484 | 1.000000 | ... | 0.009556 | 0.004544 | -0.008630 | 0.012303 | 0.010886 | -0.014965 | 0.017384 | -0.013342 | -0.023206 | 0.355851 |
| inccat | -0.007582 | -0.002149 | 0.249394 | 0.197250 | 0.067555 | 0.013028 | 0.316584 | 0.384803 | -0.326195 | 0.890702 | ... | 0.007728 | 0.009610 | -0.011887 | 0.006658 | 0.026594 | -0.014226 | 0.017837 | -0.020775 | -0.015686 | 0.381432 |
| debtinc | -0.019543 | 0.004746 | 0.040937 | 0.010705 | 0.003673 | -0.004927 | 0.031801 | 0.030637 | 0.024039 | 0.011586 | ... | -0.001494 | -0.001875 | -0.021354 | 0.008257 | 0.015922 | 0.015206 | -0.005337 | -0.006364 | 0.010513 | 0.016104 |
| creddebt | -0.039220 | -0.006574 | 0.159151 | 0.118857 | 0.068486 | 0.000925 | 0.225514 | 0.246794 | -0.158139 | 0.627578 | ... | -0.008404 | -0.004937 | -0.018175 | 0.013222 | 0.010059 | -0.012854 | 0.008394 | -0.001872 | -0.007734 | 0.233370 |
| othdebt | -0.014535 | -0.008475 | 0.176380 | 0.135689 | 0.073217 | -0.007085 | 0.249165 | 0.274617 | -0.182138 | 0.696227 | ... | 0.007184 | -0.000299 | -0.006445 | 0.005330 | 0.015447 | -0.007133 | 0.010572 | -0.007339 | -0.016628 | 0.260430 |
| lnothdebt | -0.002946 | -0.001608 | 0.159017 | 0.155380 | 0.028732 | -0.006545 | 0.195597 | 0.239395 | -0.261438 | 0.597857 | ... | 0.013295 | 0.001695 | -0.021775 | 0.008564 | 0.031811 | -0.004426 | 0.007946 | -0.016603 | -0.007341 | 0.273925 |
| default | 0.023540 | -0.001620 | -0.392859 | 0.103714 | -0.086022 | -0.017210 | -0.329242 | -0.377985 | -0.211482 | -0.017053 | ... | -0.016884 | 0.013000 | 0.002188 | -0.015942 | -0.010990 | -0.004326 | -0.014719 | 0.006847 | 0.022489 | 0.018227 |
| jobsat | -0.031121 | 0.001782 | 0.422564 | -0.065667 | 0.171703 | -0.007664 | 0.464355 | 0.513673 | 0.162462 | 0.262246 | ... | 0.018317 | -0.002072 | -0.001693 | 0.022496 | 0.018319 | -0.020679 | 0.020485 | -0.023424 | -0.015383 | 0.066688 |
| marital | 0.029922 | 0.003888 | 0.004785 | -0.022293 | 0.003116 | -0.004505 | -0.006639 | 0.001605 | -0.001566 | 0.007573 | ... | -0.005385 | -0.021191 | -0.016466 | 0.009115 | -0.011252 | 0.010382 | -0.018613 | 0.022058 | -0.001293 | 0.018688 |
| reside | 0.044371 | 0.031148 | -0.331160 | 0.013518 | -0.014961 | 0.005407 | -0.245573 | -0.257960 | -0.178646 | -0.073381 | ... | -0.020794 | -0.006654 | -0.031597 | 0.010823 | -0.016909 | 0.018676 | -0.034419 | 0.045717 | 0.002575 | 0.003824 |
| pets | -0.000967 | 0.028664 | 0.026966 | 0.030428 | 0.011706 | 0.023959 | 0.024996 | 0.029679 | 0.027544 | 0.008424 | ... | 0.014541 | 0.012767 | 0.005785 | 0.015780 | -0.010474 | 0.003919 | -0.022825 | 0.023022 | -0.026802 | -0.004739 |
| pets_cats | -0.018682 | 0.013703 | 0.021829 | 0.013500 | -0.021394 | -0.020191 | 0.007171 | 0.012764 | 0.012400 | 0.000656 | ... | 0.038672 | -0.001725 | 0.000411 | -0.003479 | 0.001755 | -0.000994 | -0.009576 | 0.003489 | -0.008096 | 0.002658 |
| pets_dogs | 0.031654 | -0.013189 | 0.011527 | 0.028381 | 0.002461 | -0.006949 | 0.003706 | 0.009117 | 0.000369 | 0.022831 | ... | -0.023940 | 0.014541 | 0.010494 | -0.006472 | 0.006480 | 0.011428 | -0.013431 | -0.010572 | 0.019910 | 0.007291 |
| pets_birds | -0.012533 | 0.024293 | -0.008446 | 0.013860 | 0.000999 | -0.011026 | -0.009249 | -0.007224 | -0.008811 | 0.001840 | ... | 0.010454 | -0.000811 | -0.015358 | 0.021012 | -0.003281 | 0.001802 | -0.005256 | 0.000851 | -0.006817 | -0.018381 |
| pets_reptiles | 0.009830 | 0.006001 | 0.014706 | -0.001208 | 0.007075 | 0.022878 | -0.002253 | -0.000579 | 0.002269 | -0.001956 | ... | 0.029346 | 0.007914 | 0.022197 | 0.018466 | 0.005829 | -0.014177 | -0.016067 | -0.022670 | -0.014967 | 0.021463 |
| pets_small | 0.001648 | 0.012543 | 0.001663 | 0.007719 | 0.006446 | 0.000327 | 0.009213 | 0.009584 | 0.002490 | -0.008239 | ... | -0.019398 | -0.009217 | 0.024878 | -0.004740 | -0.007142 | -0.016419 | 0.009139 | 0.002794 | 0.005622 | -0.003719 |
| pets_saltfish | 0.020930 | 0.003988 | -0.002900 | -0.004667 | -0.001681 | -0.016180 | -0.006251 | -0.002873 | 0.008919 | -0.011377 | ... | -0.004979 | -0.008557 | 0.023066 | 0.001744 | -0.015180 | -0.007290 | -0.014121 | 0.023378 | 0.005446 | 0.003564 |
| pets_freshfish | -0.006999 | 0.024280 | 0.021392 | 0.018085 | 0.018599 | 0.034976 | 0.028056 | 0.028895 | 0.026401 | 0.005421 | ... | 0.011218 | 0.013413 | -0.006418 | 0.014602 | -0.007712 | 0.007487 | -0.015362 | 0.026631 | -0.034619 | -0.006442 |
| homeown | 0.024133 | 0.006351 | -0.014793 | 0.054535 | 0.028167 | 0.002338 | 0.020261 | 0.021710 | -0.094162 | 0.143873 | ... | 0.010215 | 0.000848 | -0.012073 | -0.003631 | -0.011867 | -0.005710 | 0.005792 | 0.005833 | 0.028203 | 0.069823 |
| hometype | -0.003217 | 0.003200 | -0.027187 | 0.021945 | -0.022920 | -0.007312 | -0.042648 | -0.039946 | 0.051155 | -0.094639 | ... | -0.008834 | -0.007781 | 0.007931 | 0.000085 | -0.005152 | -0.012196 | 0.019970 | -0.015480 | 0.022759 | -0.016064 |
| addresscat | -0.043250 | -0.006030 | 0.829099 | -0.020501 | 0.041350 | 0.001848 | 0.598611 | 0.664111 | 0.352559 | 0.312723 | ... | 0.023160 | 0.009702 | 0.004769 | 0.026090 | 0.008147 | -0.022043 | -0.000854 | -0.033425 | -0.016062 | 0.074489 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| owntv | 0.026513 | -0.000600 | -0.059942 | 0.030799 | 0.003551 | -0.004958 | 0.015879 | 0.017323 | -0.198260 | 0.114949 | ... | -0.004997 | 0.011936 | 0.000118 | 0.000781 | 0.011607 | 0.014404 | -0.006948 | -0.022725 | -0.001148 | 0.085161 |
| hourstv | -0.002511 | -0.004984 | -0.019651 | -0.009332 | -0.030434 | -0.008694 | 0.000233 | -0.003867 | -0.104254 | 0.055100 | ... | 0.002976 | 0.002793 | -0.009479 | -0.005236 | -0.002358 | 0.001692 | -0.005914 | -0.005683 | 0.016392 | 0.044616 |
| ownvcr | -0.004501 | -0.003570 | 0.126323 | 0.069821 | 0.025124 | 0.005636 | 0.127702 | 0.168115 | -0.145458 | 0.226606 | ... | 0.011123 | -0.011806 | -0.010828 | 0.003756 | 0.023733 | -0.009884 | -0.024756 | -0.027261 | 0.016599 | 0.156810 |
| owndvd | 0.016458 | 0.012181 | 0.120848 | 0.066569 | 0.011966 | 0.012555 | 0.125163 | 0.160621 | -0.155008 | 0.229476 | ... | 0.008012 | -0.009526 | -0.018921 | 0.008438 | 0.025626 | 0.007371 | -0.017145 | -0.032629 | -0.003526 | 0.164081 |
| owncd | 0.020715 | -0.002861 | 0.078888 | 0.056563 | 0.022038 | 0.006251 | 0.107435 | 0.138173 | -0.165318 | 0.206481 | ... | -0.002627 | -0.010874 | 0.002187 | 0.009285 | 0.020117 | -0.010297 | -0.015291 | -0.030420 | 0.024308 | 0.147648 |
| ownpda | 0.022813 | -0.001116 | -0.059490 | 0.293125 | -0.009352 | 0.019565 | -0.069775 | -0.054010 | -0.097558 | 0.145061 | ... | -0.005442 | 0.037027 | -0.019966 | -0.006153 | 0.022378 | 0.004092 | 0.001475 | -0.013479 | -0.005291 | 0.070050 |
| ownpc | 0.002639 | -0.006963 | -0.123192 | 0.414619 | -0.037929 | 0.006489 | -0.162028 | -0.152394 | -0.088906 | 0.049270 | ... | -0.000765 | 0.027258 | -0.006551 | -0.004152 | -0.004875 | -0.010707 | -0.012995 | 0.025030 | -0.008375 | 0.044353 |
| ownipod | 0.005333 | -0.021319 | -0.208690 | 0.357481 | -0.026888 | -0.005895 | -0.198838 | -0.192716 | -0.132789 | 0.032922 | ... | 0.009683 | 0.002506 | -0.010049 | 0.002506 | -0.014999 | -0.001564 | -0.000297 | 0.008066 | 0.018773 | 0.041022 |
| owngame | 0.007898 | 0.001164 | -0.223387 | 0.361191 | -0.050278 | 0.006765 | -0.218677 | -0.211118 | -0.151759 | 0.040452 | ... | -0.013698 | 0.019613 | 0.005685 | 0.012394 | 0.006501 | -0.025354 | -0.006570 | 0.015082 | -0.008967 | 0.041669 |
| ownfax | -0.000562 | 0.009167 | -0.029310 | 0.273646 | 0.008409 | 0.024516 | -0.040587 | -0.035826 | -0.070564 | 0.149388 | ... | -0.007050 | 0.020517 | -0.007289 | 0.009228 | 0.008730 | 0.007946 | 0.014752 | -0.006293 | -0.005230 | 0.062793 |
| news | -0.028756 | -0.020837 | 0.348275 | -0.068420 | 0.090861 | 0.007508 | 0.368778 | 0.369206 | 0.213707 | 0.138730 | ... | -0.010923 | -0.022373 | 0.001188 | 0.020952 | -0.012477 | 0.008191 | -0.020057 | -0.014259 | 0.000763 | 0.029856 |
| response_01 | -0.006252 | 0.018058 | 0.010039 | -0.112653 | 0.000137 | -0.010492 | 0.039085 | 0.030418 | -0.003457 | -0.002440 | ... | 0.026250 | -0.021131 | 0.019617 | -0.000292 | -0.007267 | -0.026998 | -0.000950 | 0.007174 | 0.004286 | -0.009182 |
| response_02 | -0.010070 | 0.001385 | 0.010467 | 0.070172 | 0.002888 | 0.006431 | 0.014410 | 0.017504 | -0.019784 | 0.062895 | ... | 0.005899 | 0.016056 | -0.003475 | 0.005329 | -0.008612 | 0.009265 | 0.009245 | -0.013487 | -0.007114 | 0.026831 |
| response_03 | -0.007230 | -0.009685 | -0.014860 | 0.106223 | -0.008415 | 0.004479 | -0.029312 | -0.032850 | -0.038500 | 0.053011 | ... | -0.002112 | -0.014476 | 0.003894 | 0.006910 | 0.017465 | 0.010877 | -0.012009 | -0.000159 | -0.004549 | 0.061573 |
| region_2 | -0.122630 | 0.005929 | 0.001682 | 0.000149 | -0.016154 | 0.013362 | -0.015869 | -0.021696 | 0.004432 | -0.023106 | ... | -0.016618 | 0.004370 | -0.003980 | 0.004370 | -0.015932 | -0.001310 | 0.006653 | 0.000749 | -0.017613 | -0.003301 |
| region_3 | -0.014877 | 0.000888 | 0.004764 | 0.002879 | -0.007489 | -0.013458 | 0.013236 | 0.020141 | -0.004284 | 0.026175 | ... | 0.003878 | 0.004948 | 0.005719 | 0.019352 | 0.002804 | -0.011281 | -0.005602 | 0.001373 | -0.001515 | -0.001571 |
| region_4 | 0.090608 | -0.017708 | -0.028909 | 0.022514 | -0.006765 | -0.017052 | -0.025024 | -0.030639 | -0.016540 | 0.009581 | ... | -0.008707 | 0.001563 | 0.002224 | -0.014873 | 0.002988 | 0.028058 | -0.028203 | 0.005148 | 0.002659 | 0.009329 |
| region_5 | 0.335490 | -0.000195 | 0.002176 | -0.004145 | 0.008511 | 0.003755 | 0.006725 | 0.009614 | -0.024544 | 0.017907 | ... | -0.005110 | -0.002263 | 0.014693 | 0.004876 | 0.000992 | 0.004086 | 0.011044 | -0.014821 | 0.020469 | 0.029256 |
| birthmonth_August | -0.000043 | 0.003717 | 0.003384 | -0.006546 | -0.008274 | -0.004879 | -0.003698 | -0.007375 | -0.016361 | 0.007474 | ... | -0.089790 | -0.090024 | -0.089203 | -0.090024 | -0.089555 | -0.093605 | -0.087544 | -0.088849 | -0.094401 | 0.006738 |
| birthmonth_December | 0.019912 | -0.016896 | -0.013575 | -0.010645 | 0.008243 | 0.032635 | 0.000555 | -0.009242 | -0.001552 | -0.001010 | ... | -0.088824 | -0.089055 | -0.088243 | -0.089055 | -0.088592 | -0.092597 | -0.086602 | -0.087893 | -0.093385 | 0.022999 |
| birthmonth_February | -0.024500 | 0.015168 | 0.019426 | -0.020811 | -0.000695 | -0.010492 | 0.024680 | 0.023954 | -0.005494 | 0.009556 | ... | 1.000000 | -0.091464 | -0.090630 | -0.091464 | -0.090988 | -0.095102 | -0.088945 | -0.090271 | -0.095911 | -0.018395 |
| birthmonth_January | -0.008957 | 0.000704 | -0.003469 | 0.009974 | -0.006297 | 0.000998 | -0.008892 | 0.005611 | 0.002049 | 0.004544 | ... | -0.091464 | 1.000000 | -0.090866 | -0.091703 | -0.091226 | -0.095350 | -0.089177 | -0.090506 | -0.096161 | 0.000416 |
| birthmonth_July | 0.001582 | -0.013059 | 0.020750 | -0.005718 | 0.013418 | 0.023433 | 0.015298 | 0.005418 | 0.026712 | -0.008630 | ... | -0.090630 | -0.090866 | 1.000000 | -0.090866 | -0.090393 | -0.094480 | -0.088363 | -0.089680 | -0.095284 | -0.006076 |
| birthmonth_June | 0.005711 | 0.007914 | 0.006999 | 0.003428 | -0.001318 | 0.009049 | 0.008642 | 0.014542 | 0.002049 | 0.012303 | ... | -0.091464 | -0.091703 | -0.090866 | 1.000000 | -0.091226 | -0.095350 | -0.089177 | -0.090506 | -0.096161 | 0.005361 |
| birthmonth_March | 0.002053 | 0.022453 | 0.013645 | -0.014674 | -0.003819 | -0.015967 | 0.015013 | 0.028922 | -0.006945 | 0.010886 | ... | -0.090988 | -0.091226 | -0.090393 | -0.091226 | 1.000000 | -0.094854 | -0.088712 | -0.090035 | -0.095661 | 0.001327 |
| birthmonth_May | -0.000498 | -0.009947 | -0.021179 | -0.017904 | 0.017472 | -0.010117 | -0.013904 | -0.012301 | -0.005054 | -0.014965 | ... | -0.095102 | -0.095350 | -0.094480 | -0.095350 | -0.094854 | 1.000000 | -0.092724 | -0.094106 | -0.099986 | -0.003625 |
| birthmonth_November | -0.019298 | 0.005998 | 0.007737 | 0.025506 | -0.030736 | 0.005503 | 0.002812 | -0.006657 | -0.016422 | 0.017384 | ... | -0.088945 | -0.089177 | -0.088363 | -0.089177 | -0.088712 | -0.092724 | 1.000000 | -0.088013 | -0.093512 | 0.007104 |
| birthmonth_October | 0.020539 | -0.003610 | -0.028357 | 0.023151 | -0.005306 | -0.020332 | -0.034176 | -0.035800 | -0.001060 | -0.013342 | ... | -0.090271 | -0.090506 | -0.089680 | -0.090506 | -0.090035 | -0.094106 | -0.088013 | 1.000000 | -0.094906 | -0.000069 |
| birthmonth_September | 0.011270 | -0.025860 | -0.011852 | -0.005022 | 0.020834 | -0.023708 | -0.004174 | -0.007219 | 0.010554 | -0.023206 | ... | -0.095911 | -0.096161 | -0.095284 | -0.096161 | -0.095661 | -0.099986 | -0.093512 | -0.094906 | 1.000000 | -0.007363 |
| log_Total_Spend | 0.006132 | -0.078605 | 0.030192 | 0.094717 | -0.010592 | 0.021191 | 0.069522 | 0.096982 | -0.198252 | 0.355851 | ... | -0.018395 | 0.000416 | -0.006076 | 0.005361 | 0.001327 | -0.003625 | 0.007104 | -0.000069 | -0.007363 | 1.000000 |
118 rows × 118 columns
def correlation(dataset, threshold):
col_corr = set() # Set of all the names of deleted columns
corr_matrix = dataset.corr()
for i in range(len(corr_matrix.columns)):
for j in range(i):
if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
colname = corr_matrix.columns[i] # getting the name of column
col_corr.add(colname)
if colname in dataset.columns:
del dataset[colname] # deleting the column from the dataset
correlation(modified_data, 0.6)
features = modified_data[modified_data.columns.difference(['log_Total_Spend'])]
target = modified_data['log_Total_Spend']
F_values, p_values = f_regression(features, target )
f_reg_results = [(i, v, z) for i, v, z in itertools.zip_longest(features.columns, F_values, ['%.3f' % p for p in p_values])]
f_reg_results=pd.DataFrame(f_reg_results, columns=['Variable','F_Value', 'P_Value'])
f_reg_results
| Variable | F_Value | P_Value | |
|---|---|---|---|
| 0 | active | 0.099140 | 0.753 |
| 1 | agecat | 4.560250 | 0.033 |
| 2 | bfast | 4.065689 | 0.044 |
| 3 | birthmonth_August | 0.226917 | 0.634 |
| 4 | birthmonth_December | 2.645083 | 0.104 |
| 5 | birthmonth_February | 1.691716 | 0.193 |
| 6 | birthmonth_January | 0.000866 | 0.977 |
| 7 | birthmonth_July | 0.184530 | 0.668 |
| 8 | birthmonth_June | 0.143664 | 0.705 |
| 9 | birthmonth_March | 0.008795 | 0.925 |
| 10 | birthmonth_May | 0.065670 | 0.798 |
| 11 | birthmonth_November | 0.252214 | 0.616 |
| 12 | birthmonth_October | 0.000024 | 0.996 |
| 13 | birthmonth_September | 0.270970 | 0.603 |
| 14 | callcard | 2.314608 | 0.128 |
| 15 | carbought | 0.160034 | 0.689 |
| 16 | carbuy | 1.277132 | 0.258 |
| 17 | card | 298.331861 | 0.000 |
| 18 | card2 | 29.990236 | 0.000 |
| 19 | card2benefit | 0.007779 | 0.930 |
| 20 | card2fee | 1.945245 | 0.163 |
| 21 | card2type | 0.404878 | 0.525 |
| 22 | cardbenefit | 2.018254 | 0.155 |
| 23 | cardfee | 0.364467 | 0.546 |
| 24 | cardtype | 0.000029 | 0.996 |
| 25 | carown | 27.674973 | 0.000 |
| 26 | cars | 0.995049 | 0.319 |
| 27 | cartype | 0.059694 | 0.807 |
| 28 | churn | 1.873489 | 0.171 |
| 29 | commutebike | 3.309558 | 0.069 |
| ... | ... | ... | ... |
| 58 | ownipod | 8.424830 | 0.004 |
| 59 | ownpda | 24.646407 | 0.000 |
| 60 | owntv | 36.512224 | 0.000 |
| 61 | ownvcr | 125.995381 | 0.000 |
| 62 | pets | 0.112238 | 0.738 |
| 63 | pets_birds | 1.689187 | 0.194 |
| 64 | pets_cats | 0.035311 | 0.851 |
| 65 | pets_dogs | 0.265735 | 0.606 |
| 66 | pets_reptiles | 2.303407 | 0.129 |
| 67 | pets_saltfish | 0.063503 | 0.801 |
| 68 | pets_small | 0.069119 | 0.793 |
| 69 | polcontrib | 5.567432 | 0.018 |
| 70 | polparty | 0.140413 | 0.708 |
| 71 | polview | 1.084997 | 0.298 |
| 72 | reason | 1.308831 | 0.253 |
| 73 | region_2 | 0.054454 | 0.815 |
| 74 | region_3 | 0.012331 | 0.912 |
| 75 | region_4 | 0.434984 | 0.510 |
| 76 | region_5 | 4.281407 | 0.039 |
| 77 | response_01 | 0.421452 | 0.516 |
| 78 | response_02 | 3.600705 | 0.058 |
| 79 | response_03 | 19.020900 | 0.000 |
| 80 | retire | 204.476976 | 0.000 |
| 81 | telecommute | 0.044376 | 0.833 |
| 82 | tenure | 14.163837 | 0.000 |
| 83 | tollfree | 14.792822 | 0.000 |
| 84 | townsize | 0.187955 | 0.665 |
| 85 | union | 2.245453 | 0.134 |
| 86 | vote | 15.628862 | 0.000 |
| 87 | wireless | 27.549440 | 0.000 |
88 rows × 3 columns
f_reg_results.P_Value = f_reg_results.P_Value.astype('float')
# selected vars on basis of on importance using F-Regression
selected_features_from_f_reg = f_reg_results[f_reg_results.P_Value<=0.05].Variable.tolist()
ligreg = LinearRegression()
rfe = RFE(ligreg, 15)
rfe = rfe.fit(features, target )
print(rfe.support_)
print(rfe.ranking_)
[False True False False True True False False False False False False False False False False False True True False False False False False False False False False False False False False False False False False False False False False True False False False True True False False False False False False False False True True False False False False True True False False False False True False False False False False False False False False False False False True True False False False False False False False] [36 1 57 29 1 1 31 28 20 37 43 30 23 40 51 3 34 1 1 74 16 65 73 41 60 2 54 32 18 4 25 17 70 33 6 21 26 69 44 68 1 19 24 66 1 1 42 67 52 13 38 64 48 50 1 1 53 27 46 58 1 1 61 15 56 62 1 14 71 8 49 47 63 11 12 7 5 39 35 1 1 22 72 9 45 10 55 59]
feature_map = [(i, v) for i, v in itertools.zip_longest(features.columns, rfe.get_support())]
feature_map
[('active', False),
('agecat', True),
('bfast', False),
('birthmonth_August', False),
('birthmonth_December', True),
('birthmonth_February', True),
('birthmonth_January', False),
('birthmonth_July', False),
('birthmonth_June', False),
('birthmonth_March', False),
('birthmonth_May', False),
('birthmonth_November', False),
('birthmonth_October', False),
('birthmonth_September', False),
('callcard', False),
('carbought', False),
('carbuy', False),
('card', True),
('card2', True),
('card2benefit', False),
('card2fee', False),
('card2type', False),
('cardbenefit', False),
('cardfee', False),
('cardtype', False),
('carown', False),
('cars', False),
('cartype', False),
('churn', False),
('commutebike', False),
('commutebus', False),
('commutecarpool', False),
('commutecat', False),
('commutemotorcycle', False),
('commutenonmotor', False),
('commutepublic', False),
('commuterail', False),
('commutetime', False),
('commutewalk', False),
('debtinc', False),
('default', True),
('ebill', False),
('edcat', False),
('equip', False),
('gender', True),
('homeown', True),
('hometype', False),
('hourstv', False),
('income', False),
('internet', False),
('jobcat', False),
('jobsat', False),
('marital', False),
('multline', False),
('owncd', True),
('owndvd', True),
('ownfax', False),
('owngame', False),
('ownipod', False),
('ownpda', False),
('owntv', True),
('ownvcr', True),
('pets', False),
('pets_birds', False),
('pets_cats', False),
('pets_dogs', False),
('pets_reptiles', True),
('pets_saltfish', False),
('pets_small', False),
('polcontrib', False),
('polparty', False),
('polview', False),
('reason', False),
('region_2', False),
('region_3', False),
('region_4', False),
('region_5', False),
('response_01', False),
('response_02', False),
('response_03', True),
('retire', True),
('telecommute', False),
('tenure', False),
('tollfree', False),
('townsize', False),
('union', False),
('vote', False),
('wireless', False)]
selected_features_from_rfe = features.columns[rfe.get_support()].tolist()
Kbest = SelectKBest(f_classif, k=15).fit(features, target )
Kbest.get_support()
array([False, False, False, True, False, False, False, False, False,
True, True, False, False, True, False, False, False, False,
True, True, False, False, False, False, False, False, False,
False, True, False, False, False, False, False, False, False,
False, False, False, False, True, False, True, False, False,
False, False, False, False, False, False, False, False, True,
False, False, True, False, False, True, False, False, False,
False, False, False, False, True, False, False, False, False,
True, False, False, False, False, False, False, True, False,
False, False, False, False, False, False, False])
Kbest.scores_
array([0.99558822, 0.97085501, 1.08477987, 1.26632032, 1.02719361,
0.81153137, 0.97480409, 0.77302604, 0.82525139, 1.13961749,
1.43564649, 1.03391606, 0.85953583, 1.22335687, 1.04965806,
0.97175901, 1.00279754, 1.10262509, 1.19796941, 1.15461919,
1.10060702, 0.90447528, 1.01883609, 0.96917035, 0.95537703,
1.06514188, 1.11380553, 1.01612211, 1.12647526, 1.10381263,
0.95756014, 1.08831692, 1.01056788, 0.83970671, 1.02716352,
0.88757259, 1.00918474, 1.07609142, 0.96001115, 1.08891051,
1.38527749, 1.0836507 , 1.21112998, 1.04516152, 1.03391763,
0.99849439, 0.95652301, 0.97615846, 0.87840249, 1.08031759,
0.99283896, 0.91835811, 1.03629401, 1.12262108, 0.67874299,
0.87138107, 1.14909338, 1.07209666, 1.01972701, 1.1956037 ,
0.81658156, 0.81177889, 1.00406531, 0.84243065, 1.11058433,
0.91059936, 0.54425231, 1.60300973, 0.91717454, 0.94030348,
0.95955948, 0.85516238, 1.1369187 , 1.08143932, 0.96285138,
1.01352975, 0.92625826, 0.93166142, 1.06914603, 1.38895226,
0.84254466, 0.92486957, 1.10917125, 0.96617107, 0.97101078,
0.97705109, 0.93671688, 1.10068944])
feature_map = [(i, v) for i, v in itertools.zip_longest(features.columns, X_new.get_support())]
feature_map
[('active', False),
('agecat', False),
('bfast', False),
('birthmonth_August', True),
('birthmonth_December', False),
('birthmonth_February', False),
('birthmonth_January', False),
('birthmonth_July', False),
('birthmonth_June', False),
('birthmonth_March', True),
('birthmonth_May', True),
('birthmonth_November', False),
('birthmonth_October', False),
('birthmonth_September', True),
('callcard', False),
('carbought', False),
('carbuy', False),
('card', True),
('card2', True),
('card2benefit', True),
('card2fee', False),
('card2type', False),
('cardbenefit', False),
('cardfee', False),
('cardtype', False),
('carown', False),
('cars', True),
('cartype', False),
('churn', True),
('commutebike', True),
('commutebus', False),
('commutecarpool', False),
('commutecat', False),
('commutemotorcycle', False),
('commutenonmotor', False),
('commutepublic', False),
('commuterail', False),
('commutetime', False),
('commutewalk', False),
('debtinc', False),
('default', True),
('ebill', False),
('edcat', True),
('equip', False),
('gender', False),
('homeown', False),
('hometype', False),
('hourstv', False),
('income', False),
('internet', False),
('jobcat', False),
('jobsat', False),
('marital', False),
('multline', True),
('owncd', False),
('owndvd', False),
('ownfax', True),
('owngame', False),
('ownipod', False),
('ownpda', True),
('owntv', False),
('ownvcr', False),
('pets', False),
('pets_birds', False),
('pets_cats', True),
('pets_dogs', False),
('pets_reptiles', False),
('pets_saltfish', True),
('pets_small', False),
('polcontrib', False),
('polparty', False),
('polview', False),
('reason', True),
('region_2', False),
('region_3', False),
('region_4', False),
('region_5', False),
('response_01', False),
('response_02', False),
('response_03', True),
('retire', False),
('telecommute', False),
('tenure', True),
('tollfree', False),
('townsize', False),
('union', False),
('vote', False),
('wireless', False)]
selected_features_from_Kbest = features.columns[Kbest.get_support()].tolist()
# Create a combined list of features selected from all above techniques and perform VIF reduction technique to get final list of features
selected_features = set(selected_features_from_f_reg + selected_features_from_rfe + selected_features_from_Kbest)
invalid_symbols = "-+*%/" #these symbols are needed to removed as they may mislead equation
col_list = []
for col_name in modified_data.columns:
for symbol in invalid_symbols:
col_name = col_name.replace(symbol,'_')
col_list.append(col_name)
modified_data.columns = col_list
col_list = []
for col_name in selected_features:
for symbol in invalid_symbols:
col_name = col_name.replace(symbol,'_')
col_list.append(col_name)
selected_features = col_list
feature_string = '+'.join(set(selected_features))
equation = 'log_Total_Spend~' + feature_string
equation
'log_Total_Spend~homeown+gender+jobsat+card2+tenure+agecat+ownfax+multline+birthmonth_September+retire+birthmonth_May+birthmonth_February+wireless+reason+response_03+tollfree+ownipod+card+owngame+pets_saltfish+birthmonth_December+vote+ebill+birthmonth_August+owncd+hourstv+polcontrib+birthmonth_March+bfast+equip+income+internet+owntv+ownvcr+churn+edcat+pets_reptiles+default+ownpda+owndvd+region_5+card2benefit+carown'
Y,X=dmatrices(equation,modified_data,return_type='dataframe')
vif = pd.DataFrame()
vif['VIF_value'] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif
| VIF_value | features | |
|---|---|---|
| 0 | 131.168098 | Intercept |
| 1 | 1.048420 | homeown |
| 2 | 1.013753 | gender |
| 3 | 1.296038 | jobsat |
| 4 | 1.200575 | card2 |
| 5 | 2.353928 | tenure |
| 6 | 2.468654 | agecat |
| 7 | 1.554446 | ownfax |
| 8 | 1.624487 | multline |
| 9 | 1.087524 | birthmonth_September |
| 10 | 1.915380 | retire |
| 11 | 1.084161 | birthmonth_May |
| 12 | 1.080374 | birthmonth_February |
| 13 | 2.037969 | wireless |
| 14 | 1.005953 | reason |
| 15 | 1.028092 | response_03 |
| 16 | 1.318517 | tollfree |
| 17 | 1.447290 | ownipod |
| 18 | 1.184513 | card |
| 19 | 1.488155 | owngame |
| 20 | 1.006383 | pets_saltfish |
| 21 | 1.078146 | birthmonth_December |
| 22 | 1.033149 | vote |
| 23 | 1.837233 | ebill |
| 24 | 1.077572 | birthmonth_August |
| 25 | 1.544562 | owncd |
| 26 | 1.351866 | hourstv |
| 27 | 1.012463 | polcontrib |
| 28 | 1.079688 | birthmonth_March |
| 29 | 1.024261 | bfast |
| 30 | 1.981564 | equip |
| 31 | 1.584787 | income |
| 32 | 2.041438 | internet |
| 33 | 1.651319 | owntv |
| 34 | 1.565991 | ownvcr |
| 35 | 1.339283 | churn |
| 36 | 1.526091 | edcat |
| 37 | 1.010631 | pets_reptiles |
| 38 | 1.220729 | default |
| 39 | 1.586606 | ownpda |
| 40 | 1.541494 | owndvd |
| 41 | 1.011562 | region_5 |
| 42 | 1.008324 | card2benefit |
| 43 | 1.056304 | carown |
#Here, we take cut off of VIF is 2 and reduce variable(highest VIF >2) one by one
X = X.drop(['agecat'],axis=1)
vif = pd.DataFrame()
vif['VIF_value'] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif
| VIF_value | features | |
|---|---|---|
| 0 | 119.100773 | Intercept |
| 1 | 1.044490 | homeown |
| 2 | 1.013753 | gender |
| 3 | 1.238088 | jobsat |
| 4 | 1.200510 | card2 |
| 5 | 2.168830 | tenure |
| 6 | 1.554144 | ownfax |
| 7 | 1.624446 | multline |
| 8 | 1.087517 | birthmonth_September |
| 9 | 1.536613 | retire |
| 10 | 1.083883 | birthmonth_May |
| 11 | 1.080257 | birthmonth_February |
| 12 | 2.037064 | wireless |
| 13 | 1.005198 | reason |
| 14 | 1.027983 | response_03 |
| 15 | 1.318432 | tollfree |
| 16 | 1.434989 | ownipod |
| 17 | 1.184391 | card |
| 18 | 1.473335 | owngame |
| 19 | 1.006287 | pets_saltfish |
| 20 | 1.077958 | birthmonth_December |
| 21 | 1.033146 | vote |
| 22 | 1.837233 | ebill |
| 23 | 1.077565 | birthmonth_August |
| 24 | 1.544491 | owncd |
| 25 | 1.351559 | hourstv |
| 26 | 1.012221 | polcontrib |
| 27 | 1.079641 | birthmonth_March |
| 28 | 1.019810 | bfast |
| 29 | 1.981518 | equip |
| 30 | 1.465928 | income |
| 31 | 2.041383 | internet |
| 32 | 1.647444 | owntv |
| 33 | 1.563752 | ownvcr |
| 34 | 1.339220 | churn |
| 35 | 1.523462 | edcat |
| 36 | 1.009993 | pets_reptiles |
| 37 | 1.145590 | default |
| 38 | 1.586099 | ownpda |
| 39 | 1.538527 | owndvd |
| 40 | 1.011557 | region_5 |
| 41 | 1.008288 | card2benefit |
| 42 | 1.055629 | carown |
X = X.drop(['tenure'],axis=1)
vif = pd.DataFrame()
vif['VIF_value'] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif
| VIF_value | features | |
|---|---|---|
| 0 | 118.171093 | Intercept |
| 1 | 1.040115 | homeown |
| 2 | 1.013745 | gender |
| 3 | 1.222072 | jobsat |
| 4 | 1.200185 | card2 |
| 5 | 1.553072 | ownfax |
| 6 | 1.308722 | multline |
| 7 | 1.087360 | birthmonth_September |
| 8 | 1.370093 | retire |
| 9 | 1.083881 | birthmonth_May |
| 10 | 1.080198 | birthmonth_February |
| 11 | 2.036965 | wireless |
| 12 | 1.005145 | reason |
| 13 | 1.027580 | response_03 |
| 14 | 1.318262 | tollfree |
| 15 | 1.429605 | ownipod |
| 16 | 1.184388 | card |
| 17 | 1.468206 | owngame |
| 18 | 1.006287 | pets_saltfish |
| 19 | 1.077634 | birthmonth_December |
| 20 | 1.032727 | vote |
| 21 | 1.836833 | ebill |
| 22 | 1.077524 | birthmonth_August |
| 23 | 1.537616 | owncd |
| 24 | 1.351256 | hourstv |
| 25 | 1.011734 | polcontrib |
| 26 | 1.079625 | birthmonth_March |
| 27 | 1.018385 | bfast |
| 28 | 1.981487 | equip |
| 29 | 1.393548 | income |
| 30 | 2.037568 | internet |
| 31 | 1.645179 | owntv |
| 32 | 1.547771 | ownvcr |
| 33 | 1.240679 | churn |
| 34 | 1.511330 | edcat |
| 35 | 1.009978 | pets_reptiles |
| 36 | 1.120584 | default |
| 37 | 1.585633 | ownpda |
| 38 | 1.520743 | owndvd |
| 39 | 1.011536 | region_5 |
| 40 | 1.008267 | card2benefit |
| 41 | 1.055487 | carown |
X = X.drop(['internet'],axis=1)
vif = pd.DataFrame()
vif['VIF_value'] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif
| VIF_value | features | |
|---|---|---|
| 0 | 118.159283 | Intercept |
| 1 | 1.039957 | homeown |
| 2 | 1.013245 | gender |
| 3 | 1.221724 | jobsat |
| 4 | 1.200044 | card2 |
| 5 | 1.549581 | ownfax |
| 6 | 1.297454 | multline |
| 7 | 1.087360 | birthmonth_September |
| 8 | 1.370018 | retire |
| 9 | 1.083434 | birthmonth_May |
| 10 | 1.080164 | birthmonth_February |
| 11 | 2.021109 | wireless |
| 12 | 1.005093 | reason |
| 13 | 1.027292 | response_03 |
| 14 | 1.310651 | tollfree |
| 15 | 1.417757 | ownipod |
| 16 | 1.183841 | card |
| 17 | 1.444546 | owngame |
| 18 | 1.005451 | pets_saltfish |
| 19 | 1.077303 | birthmonth_December |
| 20 | 1.032704 | vote |
| 21 | 1.745226 | ebill |
| 22 | 1.077333 | birthmonth_August |
| 23 | 1.537475 | owncd |
| 24 | 1.350008 | hourstv |
| 25 | 1.011464 | polcontrib |
| 26 | 1.079602 | birthmonth_March |
| 27 | 1.018327 | bfast |
| 28 | 1.850972 | equip |
| 29 | 1.393364 | income |
| 30 | 1.644324 | owntv |
| 31 | 1.547591 | ownvcr |
| 32 | 1.235163 | churn |
| 33 | 1.486467 | edcat |
| 34 | 1.009738 | pets_reptiles |
| 35 | 1.120559 | default |
| 36 | 1.581385 | ownpda |
| 37 | 1.520732 | owndvd |
| 38 | 1.010957 | region_5 |
| 39 | 1.008262 | card2benefit |
| 40 | 1.055390 | carown |
X = X.drop(['wireless'],axis=1)
vif = pd.DataFrame()
vif['VIF_value'] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif
| VIF_value | features | |
|---|---|---|
| 0 | 118.065000 | Intercept |
| 1 | 1.039955 | homeown |
| 2 | 1.013059 | gender |
| 3 | 1.221721 | jobsat |
| 4 | 1.200044 | card2 |
| 5 | 1.415770 | ownfax |
| 6 | 1.293974 | multline |
| 7 | 1.087166 | birthmonth_September |
| 8 | 1.366950 | retire |
| 9 | 1.083145 | birthmonth_May |
| 10 | 1.080120 | birthmonth_February |
| 11 | 1.005023 | reason |
| 12 | 1.027284 | response_03 |
| 13 | 1.216305 | tollfree |
| 14 | 1.415467 | ownipod |
| 15 | 1.182758 | card |
| 16 | 1.439532 | owngame |
| 17 | 1.005218 | pets_saltfish |
| 18 | 1.077056 | birthmonth_December |
| 19 | 1.032451 | vote |
| 20 | 1.728414 | ebill |
| 21 | 1.077241 | birthmonth_August |
| 22 | 1.537475 | owncd |
| 23 | 1.349905 | hourstv |
| 24 | 1.011271 | polcontrib |
| 25 | 1.079420 | birthmonth_March |
| 26 | 1.018140 | bfast |
| 27 | 1.825734 | equip |
| 28 | 1.392506 | income |
| 29 | 1.644253 | owntv |
| 30 | 1.547429 | ownvcr |
| 31 | 1.234956 | churn |
| 32 | 1.483992 | edcat |
| 33 | 1.009656 | pets_reptiles |
| 34 | 1.120459 | default |
| 35 | 1.442643 | ownpda |
| 36 | 1.520009 | owndvd |
| 37 | 1.010928 | region_5 |
| 38 | 1.008186 | card2benefit |
| 39 | 1.055348 | carown |
vif = vif.loc[1:,:]
final_selected_vars = list(vif.features) +['log_Total_Spend']
print(final_selected_vars)
['homeown', 'gender', 'jobsat', 'card2', 'ownfax', 'multline', 'birthmonth_September', 'retire', 'birthmonth_May', 'birthmonth_February', 'reason', 'response_03', 'tollfree', 'ownipod', 'card', 'owngame', 'pets_saltfish', 'birthmonth_December', 'vote', 'ebill', 'birthmonth_August', 'owncd', 'hourstv', 'polcontrib', 'birthmonth_March', 'bfast', 'equip', 'income', 'owntv', 'ownvcr', 'churn', 'edcat', 'pets_reptiles', 'default', 'ownpda', 'owndvd', 'region_5', 'card2benefit', 'carown', 'log_Total_Spend']
modified_data = modified_data.loc[final_selected_vars]
modified_data.head()
| homeown | gender | jobsat | card2 | ownfax | multline | birthmonth_September | retire | birthmonth_May | birthmonth_February | ... | churn | edcat | pets_reptiles | default | ownpda | owndvd | region_5 | card2benefit | carown | log_Total_Spend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 5 | 0 | 1 | 1 | 0 | 0 | 0 | ... | 1 | 3 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 5.007029 |
| 1 | 1 | 0 | 1 | 4 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 4 | 0 | 1 | 1 | 1 | 1 | 3 | 1 | 4.350794 |
| 2 | 1 | 1 | 4 | 4 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 1 | 0 | 3 | 1 | 5.886021 |
| 3 | 1 | 0 | 2 | 3 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0 | 3 | 0 | 1 | 0 | 1 | 0 | 4 | 1 | 5.884464 |
| 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 0 | 0 | 0 | 1 | 0 | 2 | 0 | 6.230147 |
5 rows × 40 columns
from sklearn.model_selection import train_test_split
train,test = train_test_split(modified_data,test_size=0.3,random_state = 10)
train_X,test_X,train_y,test_y =train_test_split(modified_data.loc[:,modified_data.columns.difference(['log_Total_Spend'])],modified_data['log_Total_Spend'],test_size=0.3,random_state = 10)
train_X['intercept'] = 1.0
test_X['intercept'] = 1.0
import statsmodels.formula.api as smf
linear_model = smf.OLS(train_y,train_X).fit()
print(linear_model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: log_Total_Spend R-squared: 0.234
Model: OLS Adj. R-squared: 0.225
Method: Least Squares F-statistic: 27.12
Date: Sun, 20 Oct 2019 Prob (F-statistic): 2.99e-169
Time: 03:14:26 Log-Likelihood: -2964.0
No. Observations: 3500 AIC: 6008.
Df Residuals: 3460 BIC: 6254.
Df Model: 39
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
bfast 0.0059 0.012 0.505 0.613 -0.017 0.029
birthmonth_August -0.0170 0.037 -0.460 0.645 -0.089 0.055
birthmonth_December 0.0392 0.037 1.072 0.284 -0.033 0.111
birthmonth_February -0.0531 0.036 -1.489 0.137 -0.123 0.017
birthmonth_March -0.0131 0.037 -0.358 0.720 -0.085 0.059
birthmonth_May -0.0182 0.035 -0.513 0.608 -0.088 0.051
birthmonth_September -0.0026 0.035 -0.076 0.939 -0.070 0.065
card -0.1620 0.009 -18.355 0.000 -0.179 -0.145
card2 -0.0864 0.009 -9.630 0.000 -0.104 -0.069
card2benefit -0.0009 0.009 -0.108 0.914 -0.018 0.016
carown 0.0123 0.015 0.814 0.416 -0.017 0.042
churn 0.0367 0.024 1.500 0.134 -0.011 0.085
default 0.0040 0.024 0.168 0.867 -0.043 0.051
ebill -0.0224 0.026 -0.846 0.397 -0.074 0.029
edcat -0.0125 0.010 -1.297 0.195 -0.031 0.006
equip 0.0225 0.027 0.825 0.409 -0.031 0.076
gender -0.0867 0.019 -4.485 0.000 -0.125 -0.049
homeown 0.0055 0.020 0.272 0.786 -0.034 0.045
hourstv 0.0028 0.002 1.253 0.210 -0.002 0.007
income 0.0042 0.000 16.941 0.000 0.004 0.005
jobsat -0.0017 0.008 -0.219 0.827 -0.017 0.013
multline -0.0114 0.022 -0.524 0.600 -0.054 0.031
owncd 0.0650 0.047 1.384 0.166 -0.027 0.157
owndvd 0.1135 0.041 2.754 0.006 0.033 0.194
ownfax -0.0072 0.030 -0.242 0.809 -0.066 0.051
owngame 0.0130 0.023 0.561 0.575 -0.033 0.059
ownipod 0.0140 0.023 0.610 0.542 -0.031 0.059
ownpda 0.0173 0.029 0.595 0.552 -0.040 0.074
owntv -0.2390 0.103 -2.316 0.021 -0.441 -0.037
ownvcr 0.1076 0.042 2.549 0.011 0.025 0.190
pets_reptiles 0.0206 0.036 0.574 0.566 -0.050 0.091
pets_saltfish 0.0184 0.046 0.399 0.690 -0.072 0.109
polcontrib 0.0227 0.022 1.014 0.311 -0.021 0.067
reason -0.0024 0.003 -0.711 0.477 -0.009 0.004
region_5 0.0309 0.024 1.281 0.200 -0.016 0.078
response_03 0.0692 0.032 2.132 0.033 0.006 0.133
retire -0.1875 0.032 -5.937 0.000 -0.249 -0.126
tollfree -0.0364 0.021 -1.717 0.086 -0.078 0.005
vote 0.0191 0.020 0.976 0.329 -0.019 0.057
intercept 6.4598 0.113 57.303 0.000 6.239 6.681
==============================================================================
Omnibus: 95.142 Durbin-Watson: 2.041
Prob(Omnibus): 0.000 Jarque-Bera (JB): 104.277
Skew: -0.390 Prob(JB): 2.27e-23
Kurtosis: 3.328 Cond. No. 1.04e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.04e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
y_pred_test = linear_model.predict(test_X)
y_pred_train = linear_model.predict(train_X)
actual_spend_train = np.exp(train_y)
actual_spend_test = np.exp(test_y)
spend_pred_train = np.exp(y_pred_train)
spend_pred_test = np.exp(y_pred_test)
#MSE
print(metrics.mean_squared_error(actual_spend_train,spend_pred_train))
print(metrics.mean_squared_error(actual_spend_test,spend_pred_test))
82495.1631639484 84992.84019603771
#RMSE
print(np.sqrt(metrics.mean_squared_error(actual_spend_train,spend_pred_train)))
print(np.sqrt(metrics.mean_squared_error(actual_spend_test,spend_pred_test)))
287.21971235266636 291.5353155211864
#MAE
from sklearn import metrics
print(metrics.mean_absolute_error(actual_spend_train,spend_pred_train))
print(metrics.mean_absolute_error(actual_spend_test,spend_pred_test))
199.3886372373168 201.6426345030227
#MAPE
print(sum(abs((actual_spend_train-spend_pred_train)/actual_spend_train))/len(actual_spend_train))
print(sum(abs((actual_spend_test-spend_pred_test)/actual_spend_test))/len(actual_spend_test))
0.5181974602343599 0.5130459743673423
# top 15 drivers and their coefficient in linear regresssion equation
np.abs(linear_model.params).sort_values(ascending=False)[1:16]
owntv 0.239017 retire 0.187452 card 0.162023 owndvd 0.113493 ownvcr 0.107617 gender 0.086737 card2 0.086371 response_03 0.069202 owncd 0.065029 birthmonth_February 0.053051 birthmonth_December 0.039194 churn 0.036748 tollfree 0.036402 region_5 0.030911 polcontrib 0.022725 dtype: float64
# top 10 positive drivers and their coefficient in linear regresssion equation
linear_model.params.sort_values(ascending=False)[1:11]
owndvd 0.113493 ownvcr 0.107617 response_03 0.069202 owncd 0.065029 birthmonth_December 0.039194 churn 0.036748 region_5 0.030911 polcontrib 0.022725 equip 0.022475 pets_reptiles 0.020617 dtype: float64
# top 10 negative drivers and their coefficient in linear regresssion equation
linear_model.params.sort_values(ascending=True)[0:10]
owntv -0.239017 retire -0.187452 card -0.162023 gender -0.086737 card2 -0.086371 birthmonth_February -0.053051 tollfree -0.036402 ebill -0.022393 birthmonth_May -0.018204 birthmonth_August -0.016990 dtype: float64
import statsmodels.api as sm
sm.qqplot(linear_model.resid,fit=True,line='45')
plt.show()
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
visualizer = ResidualsPlot(model).fit(train_X,train_y)
visualizer.score(test_X,test_y)
visualizer.poof()
# this plot show that there is no pattern in residuals vs predicted value
#so Homoscedasticity assumption is checked
# Evaluating metrics and residual analysis, we can observe that linear regression fits good on data
# But accuracy can be imporoved
# The main purpose for using linear regression is to identify positive and negative drivers
# To improve accuracy, models such as DTs with or without ensembling can be used
train_X = train_X.drop(['intercept'],axis=1)
test_X = test_X.drop(['intercept'],axis=1)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [50,100,200,300],
'max_features': ['auto', 'sqrt', 'log2'],
'max_depth': np.arange(3, 16),
'max_features': np.arange(3,7),
'bootstrap': [False, True],
'n_jobs':[-1]
}
tree = GridSearchCV(RandomForestRegressor(), param_grid, cv = 10)
tree.fit(train_X, train_y)
GridSearchCV(cv=10, error_score='raise-deprecating',
estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
oob_score=False, random_state=None, verbose=0, warm_start=False),
fit_params=None, iid='warn', n_jobs=None,
param_grid={'n_estimators': [50, 100, 200, 300], 'max_features': array([3, 4, 5, 6]), 'max_depth': array([ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]), 'bootstrap': [False, True], 'n_jobs': [-1]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring=None, verbose=0)
tree.best_params_
{'bootstrap': True,
'max_depth': 11,
'max_features': 6,
'n_estimators': 100,
'n_jobs': -1}
y_pred_test = tree.predict(test_X)
y_pred_train = tree.predict(train_X)
actual_spend_train = np.exp(train_y)
actual_spend_test = np.exp(test_y)
spend_pred_train = np.exp(y_pred_train)
spend_pred_test = np.exp(y_pred_test)
#MSE
print(metrics.mean_squared_error(actual_spend_train,spend_pred_train))
print(metrics.mean_squared_error(actual_spend_test,spend_pred_test))
47328.10521761513 81344.62293322243
#RMSE
print(np.sqrt(metrics.mean_squared_error(actual_spend_train,spend_pred_train)))
print(np.sqrt(metrics.mean_squared_error(actual_spend_test,spend_pred_test)))
217.55023607804964 285.20978758314453
#MAE
from sklearn import metrics
print(metrics.mean_absolute_error(actual_spend_train,spend_pred_train))
print(metrics.mean_absolute_error(actual_spend_test,spend_pred_test))
146.02303336445783 194.7830521005078
#MAPE
print(sum(abs((actual_spend_train-spend_pred_train)/actual_spend_train))/len(actual_spend_train))
print(sum(abs((actual_spend_test-spend_pred_test)/actual_spend_test))/len(actual_spend_test))
0.35394614664763335 0.48337233134479834
# accuracy is improved after using Random forests
# variable importance(drivers) can be obtained through Random Forests but not positive and negative drivers seperately
rf = RandomForestRegressor(bootstrap=True,max_depth=11,max_features=6,n_estimators=100).fit(train_X,train_y)
# top 15 drivers
feature_imp_df = pd.DataFrame({'features': train_X.columns,'feature_importance':rf.feature_importances_}).sort_values(by=['feature_importance'],ascending=False)
feature_imp_df.iloc[:15,:]
| features | feature_importance | |
|---|---|---|
| 19 | income | 0.198614 |
| 7 | card | 0.184390 |
| 8 | card2 | 0.068584 |
| 18 | hourstv | 0.058121 |
| 20 | jobsat | 0.035700 |
| 36 | retire | 0.034069 |
| 14 | edcat | 0.031512 |
| 9 | card2benefit | 0.026926 |
| 33 | reason | 0.023303 |
| 10 | carown | 0.022859 |
| 29 | ownvcr | 0.020321 |
| 0 | bfast | 0.019532 |
| 23 | owndvd | 0.016955 |
| 16 | gender | 0.014496 |
| 38 | vote | 0.014409 |
# top 15 drivers
sns.barplot(x=feature_imp_df.iloc[:15,:].feature_importance,y=feature_imp_df.iloc[:15,:].features)
<matplotlib.axes._subplots.AxesSubplot at 0x23d1aeb0f60>